library(tidyverse)
── Attaching core tidyverse packages ─────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(WGCNA)
Loading required package: dynamicTreeCut
Loading required package: fastcluster
Attaching package: ‘fastcluster’
The following object is masked from ‘package:stats’:
hclust
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'rmarkdown':
method from
print.paged_df
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘WGCNA’
The following object is masked from ‘package:stats’:
cor
library(gplots)
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
options(stringsAsFactors = FALSE)
load sample info
sample.description <- read.csv("../output/sample.description.gametophyte.all.csv")
load reads
lcpm <- read.csv("../output/gam_combined_log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
[1] 27206 71
Filter for genes with the highest coefficient of variation
CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))
names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
Ceric.18G076300.v2.1 Ceric.33G031700.v2.1
0.1511936 0.6073808
quantile(CV, 0.40)
40%
0.1498988
LFY2 has a pretty low CV; have to include 60% of the genes by CV to include it.
lcpm.filter <- lcpm[CV > quantile(CV, 0.4),]
dim(lcpm.filter)
[1] 16323 71
WGCNA wants genes in columns
lcpm.filter.t <- t(lcpm.filter)
Soft thresholding
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 20000)
pickSoftThreshold: calculating connectivity for given powers...
..working on genes 1 through 16323 of 16323
Warning: executing %dopar% sequentially: no parallel backend registered
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
choose 5
softPower <- 5
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
dissTOM <- 1-TOM
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
define modules
# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit <- 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
..done.
table(dynamicMods)
dynamicMods
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3212 1387 1022 997 986 979 920 784 753 734 721 720 611 609 496
16 17 18 19 20 21 22
380 280 224 187 152 133 36
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
black blue brown cyan darkgreen darkred
920 1387 1022 609 36 133
green greenyellow grey60 lightcyan lightgreen lightyellow
986 721 280 380 224 187
magenta midnightblue pink purple red royalblue
753 496 784 734 979 152
salmon tan turquoise yellow
611 720 3212 997
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
merge similar modules
# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
merge with correlation > 0.8
MEDissThres = 0.2
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
mergeCloseModules: Merging modules whose distance is less than 0.2
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 22 module eigengenes in given set.
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 19 module eigengenes in given set.
Calculating new MEs...
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 19 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
compare pre and post merge
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs
table(merge$colors)
black blue brown darkgreen darkred green
920 1996 1022 36 133 986
greenyellow grey60 lightcyan lightgreen lightyellow magenta
721 280 380 224 1166 1473
midnightblue pink purple royalblue salmon turquoise
496 784 734 152 611 3212
yellow
997
length(table(merge$colors))
[1] 19
median(table(merge$colors))
[1] 734
Which module is LFY in?
CrLFY1 <- "Ceric.33G031700"
CrLFY2 <- "Ceric.18G076300"
module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)
module.assignment %>%
filter(str_detect(geneID, "18G076300|33G031700"))
Interesting: they are in different modules(!).
module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)
Plot eigengenes
Make sure sample info sheet is in the correct order.
rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
sample.eigen.mean <- sample.eigen %>%
group_by(group) %>%
summarize(across(starts_with("ME"), mean),
label=unique(label))
sample.eigen %>%
pivot_longer(starts_with("ME"), names_to = "ME") %>%
ggplot(aes(y=label, x = value)) +
geom_point() +
facet_wrap(~ME, ncol=5) +
theme(axis.text.y = element_text(size = 6))
A heat map:
sample.eigen.mean$group %>% sort
[1] "CrLFY1-OX-gam"
[2] "CrLFY2-OX-gam"
[3] "Cross-gam"
[4] "PRJEB33372-mature gametophyte"
[5] "PRJNA1121398-5 DAG CrHAM KF hermaphrodite"
[6] "PRJNA1121398-5 DAG CrHAM KF male"
[7] "PRJNA1121398-5 DAG WT hermaphrodite"
[8] "PRJNA1121398-5 DAG WT male"
[9] "PRJNA1121398-8 DAG CrHAM KF hermaphrodite"
[10] "PRJNA1121398-8 DAG WT hermaphrodite"
[11] "PRJNA1149654-whole gametophyte DMSO"
[12] "PRJNA1149654-whole gametophyte IAA"
[13] "PRJNA248887-4.5 DAG gametophyte"
[14] "PRJNA248887-4.5 DAG gametophyte + antheridogen"
[15] "PRJNA651769-mature gametophyte"
[16] "PRJNA651770-immature gametophyte"
[17] "PRJNA681601-Cr_Herm"
[18] "PRJNA681601-Cr_Male"
[19] "RNAi-gam"
[20] "WT-male_gam"
[21] "WT-mature_herm"
sample.eigen.mean$label %>% sort
[1] "4.5 DAG gametophyte" "4.5 DAG gametophyte + antheridogen"
[3] "5 DAG CrHAM KF hermaphrodite" "5 DAG CrHAM KF male"
[5] "5 DAG WT hermaphrodite" "5 DAG WT male"
[7] "8 DAG CrHAM KF hermaphrodite" "8 DAG WT hermaphrodite"
[9] "Cr_Herm" "Cr_Male"
[11] "CrLFY1-OX-gam" "CrLFY2-OX-gam"
[13] "Cross-gam" "immature gametophyte"
[15] "mature gametophyte" "mature gametophyte"
[17] "RNAi-gam" "whole gametophyte DMSO"
[19] "whole gametophyte IAA" "WT-male_gam"
[21] "WT-mature_herm"
MEs.m <- sample.eigen.mean %>% mutate(label=make.names(label, unique = TRUE)) %>% column_to_rownames("label") %>% select(starts_with("ME")) %>% as.matrix()
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")
save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_gametophyte_all.Rdata")